# Alex F. Gazmararian
# agazmararian@gmail.com

library(tidyverse)
library(here)
library(tidylog)

#' Calculate agreement metrics between two binary variables
#'
#' Both inputs are coerced with `as.numeric()` and compared pairwise by
#' position.
#'
#' @param x First binary variable (0/1 or TRUE/FALSE)
#' @param y Second binary variable (0/1 or TRUE/FALSE); must be the same
#'   length as `x`
#' @param na.rm Logical, whether to drop pairs in which either value is NA
#'   before calculating (default TRUE). When FALSE and NAs are present, every
#'   count or proportion that involves an NA pair is returned as NA.
#' @return A list with elements:
#'   `pct_agreement` (share of pairs that are equal; NA when there are no
#'   pairs), `n_agree`, `n_disagree`, `n_total`, `table` (contingency table
#'   of `x` by `y`), and `pct_by_category` (named list; for each observed
#'   value, the share of agreeing pairs among pairs where either side takes
#'   that value).
calc_agreement <- function(x, y, na.rm = TRUE) {
  # Fail early with a clear message instead of relying on recycling or on an
  # error raised deep inside complete.cases()/table().
  if (length(x) != length(y)) {
    stop("`x` and `y` must have the same length.", call. = FALSE)
  }

  # Convert to numeric if logical
  x <- as.numeric(x)
  y <- as.numeric(y)

  # Handle NAs
  if (na.rm) {
    complete <- complete.cases(x, y)
    x <- x[complete]
    y <- y[complete]
  }

  # Calculate basic counts (NA propagates when na.rm = FALSE and NAs exist)
  same <- x == y
  n_total <- length(x)
  n_agree <- sum(same)
  n_disagree <- sum(!same)

  # Create contingency table
  cont_table <- table(x, y, dnn = c("x", "y"))

  # Calculate percentage agreement by category
  # For each value (0,1), what % of the pairs involving that value agree
  pct_by_cat <- list()
  for (val in sort(unique(c(x, y)))) {
    # %in% never returns NA, so the mask is always usable as an index and in
    # `if`, even when the inputs still contain NAs.
    involved <- x %in% val | y %in% val
    if (any(involved)) {
      pct_by_cat[[as.character(val)]] <-
        sum(x[involved] == y[involved]) / sum(involved)
    }
  }

  # Return results
  list(
    pct_agreement = if (n_total > 0) n_agree / n_total else NA_real_,
    n_agree = n_agree,
    n_disagree = n_disagree,
    n_total = n_total,
    table = cont_table,
    pct_by_category = pct_by_cat
  )
}

#' Print agreement metrics in a formatted way
#'
#' Summary lines are emitted with `message()`; the contingency table is
#' printed with `print()`.
#'
#' @param agreement_results Output from calc_agreement function
#' @return `agreement_results`, invisibly, so the call can be used in a pipe
print_agreement <- function(agreement_results) {
  message("Agreement Summary:")
  message(sprintf(
    "Overall Agreement: %.1f%%",
    agreement_results$pct_agreement * 100
  ))
  message(sprintf("Total Cases: %d", agreement_results$n_total))
  message(sprintf(
    "Agree: %d, Disagree: %d",
    agreement_results$n_agree,
    agreement_results$n_disagree
  ))

  message("\nContingency Table:")
  print(agreement_results$table)

  message("\nAgreement by Category:")
  # Loop variable is named `category` so it does not mask base::cat().
  for (category in names(agreement_results$pct_by_category)) {
    message(sprintf(
      "Category %s: %.1f%%",
      category,
      agreement_results$pct_by_category[[category]] * 100
    ))
  }

  invisible(agreement_results)
}

# Load and prepare data

# Read one coder's QC file from data/input/credit/annotation_qc, record who
# coded it, and store `id` as character (presumably so the per-coder files
# have a common `id` type when stacked -- confirm against the raw files).
load_qc <- function(coder) {
  qc_path <- here(
    "data", "input", "credit", "annotation_qc",
    paste0(coder, "_qc.csv")
  )
  coded <- read_csv(qc_path, show_col_types = FALSE)
  coded$coder <- coder
  coded$id <- as.character(coded$id)
  coded
}

dylan <- load_qc("dylan")
ahmed <- load_qc("ahmed")
branden <- load_qc("branden")

# Stack the three coders, suffix the coded variables with "_qc" so they can
# sit next to the original annotations, and drop columns not needed for the
# comparison.
qc <- bind_rows(dylan, ahmed, branden) %>%
  rename_with(~ paste0(., "_qc"), gives_credit:credit_bil) %>%
  subset(select = -c(id, note, actor, statement))

# Keep only the annotated statements that were QC-coded, then attach the QC
# codings by statement_id.
g <- here("data", "inter", "annotated_statements.csv") %>%
  read_csv(show_col_types = FALSE) %>%
  filter(statement_id %in% qc$statement_id) %>%
  left_join(qc, by = "statement_id")

# Check agreement for all credit variables
#
# Each calc_agreement() call compares the original annotation with its "_qc"
# counterpart. For selected variables the disagreeing rows are also listed.
# The row index is wrapped in which() so that pairs where either coding is
# NA are skipped; a bare logical index containing NA would add all-NA rows
# to the listing.
calc_agreement(g$gives_credit, g$gives_credit_qc)
g[which(g$gives_credit != g$gives_credit_qc), ]

calc_agreement(g$credit_biden, g$credit_biden_qc)

calc_agreement(g$credit_senate, g$credit_senate_qc)
g[which(g$credit_senate != g$credit_senate_qc), ]

calc_agreement(g$credit_governor, g$credit_governor_qc)

calc_agreement(g$credit_us_rep, g$credit_us_rep_qc)
g[which(g$credit_us_rep != g$credit_us_rep_qc), ]

calc_agreement(g$credit_local, g$credit_local_qc)

calc_agreement(g$credit_dem, g$credit_dem_qc)

calc_agreement(g$credit_gop, g$credit_gop_qc)

calc_agreement(g$credit_ira, g$credit_ira_qc)

calc_agreement(g$credit_bil, g$credit_bil_qc)
